if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, broom, skimr, devtools, ggpubr, mgcv, extrafont, mgcViz, here)
theme_set(theme_pubclean(base_size = 14))
# load the function to print GAM figures
source(here("R", "p_gam.R"))
# import
lesion <- read_csv(here("cache", "summarised_lesion_data.csv"))
weather <- read_csv(here("cache", "weather_summary.csv"))
dat <- left_join(lesion, weather, by = c("site" = "Location", "rep" = "Rep"))
For reproducibility purposes, use set.seed().
set.seed(27)
mod1 <-
gam(
mean_pot_count ~ s(distance, k = 5),
data = dat
)
summary(mod1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0802 0.0475 22.7 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.93 4 78.4 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.482 Deviance explained = 48.8%
## GCV = 0.76522 Scale est. = 0.75394 n = 334
print(p_gam(x = getViz(mod1)) +
ggtitle("s(Distance)"),
pages = 1)
mod2 <-
gam(
mean_pot_count ~ sum_rain+ s(distance, k = 5),
data = dat
)
summary(mod2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.11372 0.05794 19.22 <0.0000000000000002 ***
## sum_rain -0.00740 0.00733 -1.01 0.31
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.93 4 78.4 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.482 Deviance explained = 48.9%
## GCV = 0.76751 Scale est. = 0.75389 n = 334
print(p_gam(x = getViz(mod2)) +
ggtitle("s(Distance) + Precipitation"),
pages = 1)
mod3 <-
gam(mean_pot_count ~ mws + s(distance, k = 5),
data = dat)
summary(mod3)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ mws + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6855 0.1093 6.27 0.0000000011 ***
## mws 0.1213 0.0304 3.99 0.0000823409 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.93 4 81.9 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.504 Deviance explained = 51.1%
## GCV = 0.7343 Scale est. = 0.72127 n = 334
print(p_gam(x = getViz(mod3)) +
ggtitle("s(Distance) + Windspeed"),
pages = 1)
mod4 <-
gam(mean_pot_count ~ sum_rain + mws + s(distance, k = 5),
data = dat)
summary(mod4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + mws + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.71328 0.11019 6.47 0.00000000035 ***
## sum_rain -0.01255 0.00725 -1.73 0.085 .
## mws 0.13019 0.03076 4.23 0.00003000169 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.93 4 82.4 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.507 Deviance explained = 51.6%
## GCV = 0.7321 Scale est. = 0.71691 n = 334
print(p_gam(x = getViz(mod4)) +
ggtitle("s(Distance) + Windspeed + Precipitation"),
pages = 1)
mod5 <-
gam(
mean_pot_count ~ sum_rain + s(distance + mws, k = 5),
data = dat
)
## Warning in term[i] <- attr(terms(reformulate(term[i])), "term.labels"): number
## of items to replace is not a multiple of replacement length
summary(mod5)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance + mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.11372 0.05794 19.22 <0.0000000000000002 ***
## sum_rain -0.00740 0.00733 -1.01 0.31
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.93 4 78.4 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.482 Deviance explained = 48.9%
## GCV = 0.76751 Scale est. = 0.75389 n = 334
print(p_gam(x = getViz(mod5)) +
ggtitle("s(Distance + Windspeed) + Precipitation"),
pages = 1)
mod6 <-
gam(
mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5),
data = dat
)
summary(mod6)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5192 0.2687 1.93 0.054 .
## sum_rain 0.1241 0.0586 2.12 0.035 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.94 4.00 93.8 < 0.0000000000000002 ***
## s(mws) 3.83 3.97 17.2 0.0000000000014 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.6495 Scale est. = 0.63051 n = 334
print(p_gam(x = getViz(mod6)) +
ggtitle("s(Distance) + s(Windspeed) + Precipitation"),
pages = 1)
mod7 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5),
data = dat
)
summary(mod7)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0802 0.0437 24.7 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.94 4.00 92.6 < 0.0000000000000002 ***
## s(mws) 3.27 3.62 16.2 0.000000000033 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.561 Deviance explained = 57%
## GCV = 0.65512 Scale est. = 0.63903 n = 334
print(p_gam(x = getViz(mod7)) +
ggtitle("s(Distance) + s(Windspeed)"),
pages = 1)
mod8 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain, k = 5),
data = dat
)
summary(mod8)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0802 0.0434 24.9 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.94 4.00 93.81 < 0.0000000000000002 ***
## s(mws) 3.83 3.97 17.18 0.0000000000014 ***
## s(sum_rain) 1.00 1.00 4.47 0.035 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.6495 Scale est. = 0.63051 n = 334
print(p_gam(x = getViz(mod8)) +
ggtitle("s(Distance) + s(Windspeed) + s(Precipitation)"),
pages = 1)
mod9 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5),
data = dat
)
summary(mod9)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0802 0.0434 24.9 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.94 4 94.1 < 0.0000000000000002 ***
## s(sum_rain) 3.95 4 17.6 0.00000000000092 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.568 Deviance explained = 57.8%
## GCV = 0.64583 Scale est. = 0.62864 n = 334
print(p_gam(x = getViz(mod9)) +
ggtitle("s(Distance) + s(Precipitation)"),
pages = 1)
mod10 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws,
data = dat
)
summary(mod10)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.04950 0.12494 8.40 0.0000000000000014 ***
## mws 0.00945 0.03599 0.26 0.79
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.94 4 93.8 < 0.0000000000000002 ***
## s(sum_rain) 3.95 4 12.7 0.0000000016 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.64973 Scale est. = 0.63051 n = 334
print(p_gam(x = getViz(mod10)) +
ggtitle("s(Distance) + s(Precipitation) + Windspeed"),
pages = 1)
This is the same as mod8 but using family = tw(), see ?family.mgcv for more on the families. The Tweedie distribution is used where the distribution has a positive mass at zero, but is continuous unlike the Poisson distribution that requires count data. The data visualisation shows clearly that the mean pot count data have this shape.
mod11.0 <-
gam(
mean_pot_count ~ s(distance, k = 5) +
s(mws, k = 5) +
s(sum_rain, k = 5),
data = dat,
family = tw()
)
summary(mod11.0)
##
## Family: Tweedie(p=1.044)
## Link function: log
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.228 0.041 -5.57 0.000000053 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.50 3.85 123.78 < 0.0000000000000002 ***
## s(mws) 1.98 2.01 10.23 0.000037 ***
## s(sum_rain) 2.86 2.97 4.48 0.003 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.674 Deviance explained = 61.2%
## -REML = 309.53 Scale est. = 0.36395 n = 334
print(p_gam(x = getViz(mod11.0)) +
ggtitle("s(Distance) + s(Windspeed) + s(Precipitation), family = tw()"),
pages = 1)
mod11.1 <-
gam(
mean_pot_count ~ s(distance, k = 5, bs = "ts") +
s(mws, k = 5, bs = "ts") +
s(sum_rain, k = 5, bs = "ts"),
data = dat,
family = tw()
)
summary(mod11.1)
##
## Family: Tweedie(p=1.044)
## Link function: log
##
## Formula:
## mean_pot_count ~ s(distance, k = 5, bs = "ts") + s(mws, k = 5,
## bs = "ts") + s(sum_rain, k = 5, bs = "ts")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2239 0.0409 -5.47 0.000000088 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.248778 4 117.8 <0.0000000000000002 ***
## s(mws) 3.354524 4 23.3 <0.0000000000000002 ***
## s(sum_rain) 0.000176 4 0.0 0.57
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.663 Deviance explained = 60.4%
## -REML = 315.48 Scale est. = 0.36466 n = 334
print(
p_gam(x = getViz(mod11.1)) +
ggtitle(
"s(Distance, bs = 'ts') + s(Windspeed, bs = 'ts')\n+ s(Precipitation, bs = 'ts'), family = tw()"
),
pages = 1
)
This model, same structure as mod11.0, uses thin-plate splines to shrink the coefficients of the smooth to zero when possible.
models <- list(mod1 = mod1,
mod2 = mod2,
mod3 = mod3,
mod4 = mod4,
mod5 = mod5,
mod6 = mod6,
mod7 = mod7,
mod8 = mod8,
mod9 = mod9,
mod10 = mod10,
mod11.0 = mod11.0,
mod11.1 = mod11.1
)
map_df(models, glance, .id = "model") %>%
arrange(AIC)
## # A tibble: 12 x 7
## model df logLik AIC BIC deviance df.residual
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 mod11.0 9.34 -320. 663. 708. 140. 325.
## 2 mod11.1 7.60 -330. 681. 720. 144. 326.
## 3 mod9 8.89 -392. 804. 841. 204. 325.
## 4 mod6 9.77 -392. 805. 846. 204. 324.
## 5 mod8 9.76 -392. 805. 846. 204. 324.
## 6 mod10 9.88 -392. 806. 847. 204. 324.
## 7 mod7 8.20 -395. 808. 843. 208. 326.
## 8 mod4 6.93 -415. 846. 876. 234. 327.
## 9 mod3 5.93 -416. 847. 873. 237. 328.
## 10 mod1 4.93 -424. 860. 883. 248. 329.
## 11 mod2 5.93 -424. 861. 888. 247. 328.
## 12 mod5 5.93 -424. 861. 888. 247. 328.
enframe(c(
mod1 = summary(mod1)$r.sq,
mod2 = summary(mod2)$r.sq,
mod3 = summary(mod3)$r.sq,
mod4 = summary(mod4)$r.sq,
mod5 = summary(mod5)$r.sq,
mod6 = summary(mod6)$r.sq,
mod7 = summary(mod7)$r.sq,
mod8 = summary(mod8)$r.sq,
mod9 = summary(mod9)$r.sq,
mod10 = summary(mod10)$r.sq,
mod11.0 = summary(mod11.0)$r.sq,
mod11.1 = summary(mod11.1)$r.sq
)) %>%
arrange(desc(value))
## # A tibble: 12 x 2
## name value
## <chr> <dbl>
## 1 mod11.0 0.674
## 2 mod11.1 0.663
## 3 mod9 0.568
## 4 mod10 0.566
## 5 mod6 0.566
## 6 mod8 0.566
## 7 mod7 0.561
## 8 mod4 0.507
## 9 mod3 0.504
## 10 mod2 0.482
## 11 mod5 0.482
## 12 mod1 0.482
anova(mod1,
mod2,
mod3,
mod4,
mod5,
mod6,
mod7,
mod8,
mod9,
mod10,
mod11.0,
mod11.1,
test = "F")
## Analysis of Deviance Table
##
## Model 1: mean_pot_count ~ s(distance, k = 5)
## Model 2: mean_pot_count ~ sum_rain + s(distance, k = 5)
## Model 3: mean_pot_count ~ mws + s(distance, k = 5)
## Model 4: mean_pot_count ~ sum_rain + mws + s(distance, k = 5)
## Model 5: mean_pot_count ~ sum_rain + s(distance + mws, k = 5)
## Model 6: mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5)
## Model 7: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5)
## Model 8: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
## Model 9: mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5)
## Model 10: mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws
## Model 11: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
## Model 12: mean_pot_count ~ s(distance, k = 5, bs = "ts") + s(mws, k = 5,
## bs = "ts") + s(sum_rain, k = 5, bs = "ts")
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 329 248
## 2 328 247 1.000008 1 2.11 0.1470
## 3 328 237 0.000354 11 83105.02 0.0000000000095 ***
## 4 327 234 1.000059 2 5.90 0.0157 *
## 5 328 247 -1.000413 -13 35.30 0.0000000072711 ***
## 6 324 204 3.972107 43 29.67 < 0.0000000000000002 ***
## 7 325 208 -1.347884 -4 7.66 0.0026 **
## 8 324 204 1.347362 4 7.66 0.0026 **
## 9 325 204 -0.972597 0
## 10 324 204 0.999445 0 0.05 0.8218
## 11 324 639 0.343094 -435
## 12 325 660 -1.162338 -21 49.64 0.0000000000005 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod11.0_vis <- getViz(mod11.0)
check(mod11.0_vis,
a.qq = list(method = "tnorm",
a.cipoly = list(fill = "light blue")),
a.respoi = list(size = 0.5),
a.hist = list(bins = 10))
##
## Method: REML Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-0.0003024,-0.000000006342]
## (score 309.5 & scale 0.3639).
## Hessian positive definite, eigenvalue range [0.4238,2979].
## Model rank = 13 / 13
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(distance) 4.00 3.50 0.86 0.010 **
## s(mws) 4.00 1.98 0.89 0.045 *
## s(sum_rain) 4.00 2.86 0.89 0.040 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod11.1_vis <- getViz(mod11.1)
check(mod11.1_vis,
a.qq = list(method = "tnorm",
a.cipoly = list(fill = "light blue")),
a.respoi = list(size = 0.5),
a.hist = list(bins = 10))
##
## Method: REML Optimizer: outer newton
## full convergence after 11 iterations.
## Gradient range [-0.00007463,0.00003687]
## (score 315.5 & scale 0.3647).
## Hessian positive definite, eigenvalue range [0.00007462,2980].
## Model rank = 13 / 13
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(distance) 4.000000 3.248778 0.84 0.005 **
## s(mws) 4.000000 3.354524 0.86 0.005 **
## s(sum_rain) 4.000000 0.000176 0.87 0.005 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This model, mod11.0, mean_pot_count ~ s(Distance) + s(Windspeed) + s(Precipitation) - family = tw(), is the best performing model. It cannot be used for predictions, but suitably describes the dispersal data we have on hand with the parameters used. More data would be desireable to increase the value of k as evidenced in the GAM checks.